Load R packages

# Data manipulation and plotting
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
# library() (not require()) so a missing package fails loudly instead of returning FALSE
library(RColorBrewer) ; library(viridis) ; library(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
# Bioinformatics: annotation, DE analysis, batch correction, network analysis
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(WGCNA) ; library(vsn)
library(dendextend) ; library(expss)
# Report rendering
library(knitr) ; library(kableExtra)
# GEO download and limma DE framework
library(GEOquery) ;  library(limma)


Preparing the data


Dataset downloaded from two different places:


# LOAD DATA

# Expression data downloaded from GEMMA (https://gemma.msl.ubc.ca/expressionExperiment/showExpressionExperiment.html?id=11805)
datExpr = read.delim('./../Data/inputData/11805_GSE102741_expmat.data.txt.gz', comment.char='#')

# Split the gene-annotation columns away from the expression values
annot_cols = c('Probe', 'Sequence', 'GeneSymbol', 'GeneName', 'GemmaId', 'NCBIid')
datGenes_original = datExpr %>% 
                    dplyr::select(all_of(annot_cols)) %>% 
                    dplyr::rename('entrezgene' = NCBIid, 'hgnc_symbol' = GeneSymbol) %>% 
                    mutate(entrezgene = as.integer(as.character(entrezgene)))
datExpr = datExpr %>% dplyr::select(-all_of(annot_cols))
# Sample columns are dot-separated composite names; keep the third field (the sample label)
colnames(datExpr) = vapply(colnames(datExpr), function(s) strsplit(s, '\\.')[[1]][3], character(1)) %>% unname

# Metadata downloaded from GEO
GEO_data = getGEO('GSE102741', destdir='./../Data/inputData')[[1]]

# Build the sample metadata from the GEO phenotype table:
# - Diagnosis: CTL if the characteristics string mentions 'control', ASD otherwise
# - Age / Ethnicity: parsed from fixed positions of the characteristics fields
#   (assumes GEO keeps these column layouts -- verify if the series is ever updated)
datMeta = GEO_data@phenoData@data %>% 
          mutate(Diagnosis = factor(ifelse(grepl('control', characteristics_ch1), 'CTL', 'ASD'), 
                                    levels = c('CTL','ASD')), 
                 Age = substring(characteristics_ch1.4, 6) %>% as.numeric %>% round, 
                 Sex = `Sex:ch1`, 
                 Sample_ID = description, 
                 Ethnicity = substring(characteristics_ch1.6, 7), 
                 title = gsub(' ', '', title)) %>% 
          dplyr::rename('Subject_ID' = title) %>% 
          dplyr::select(Subject_ID, geo_accession, Sample_ID, Diagnosis, Age, Sex, Ethnicity)
# Align metadata rows with the expression matrix columns (matched by Subject_ID)
datMeta = datMeta[match(colnames(datExpr), datMeta$Subject_ID),]

# SFARI Genes: keep one row per Ensembl ID, dropping entries without an ID
SFARI_genes = read_csv('./../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

# NCBI biotype annotation, renamed to BioMart-style column names.
# Read strings as character (not factors): under R 3.6's stringsAsFactors=TRUE default
# the ifelse() below can return the factor's integer level codes for the
# non-'protein-coding' branch, which appears to be why numeric 'biotypes'
# (1, 3, 6, 7) show up in the biotype table further down -- verify against the file
NCBI_biotype = read.csv('./../../NCBI/Data/gene_biotype_info.csv', stringsAsFactors = FALSE) %>% 
               dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene, 
                             'hgnc_symbol'=Symbol) %>% 
               mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))

Check sample composition


Dataset consists of 52 samples (13 ASD and 39 Controls), all extracted from the DLPFC of the brain

Sequenced using Illumina’s HiSeq 2000 (Gupta used the same, Gandal used Illumina HiSeq 2500, they are compatible)


The dataset includes 25981 genes from 52 samples belonging to 52 different subjects


Counts distribution: The data has already been preprocessed, so we have relatively balanced values, centered close to 0

# Long format: one row per (sample, expression value) pair
counts = datExpr %>% melt

# Five-number summary plus the mean of all expression values
qs = quantile(counts$value, probs = c(.25, .5, .75)) %>% unname
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
                         'Values' = c(min(counts$value), qs[1], qs[2], mean(counts$value), qs[3],
                                      max(counts$value)))

count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
Statistic Values
Min -5.77
1st Quartile -1.97
Median 1.06
Mean 0.89
3rd Quartile 3.81
Max 16.47
# Density of all expression values (already preprocessed, roughly centred near 0)
counts %>% ggplot(aes(value)) + geom_density(color = '#993399', fill = '#993399', alpha = 0.3) + theme_minimal()

rm(counts, count_distr)




2.2.1 Gene annotation


Biotype

I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:

  1. Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels

  2. Annotate genes with Biotype labels:

    2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)

    2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)

    2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys (17 genes return multiple labels)

    2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys


# Tracker for how many genes each annotation source ends up labelling (filled in below)
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
                                      'n_matches' = rep(0,4))

########################################################################################
# 1. Query archive version

getinfo = c('entrezgene','ensembl_gene_id','external_gene_id','chromosome_name','start_position',
            'end_position','strand')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
# right_join keeps every gene from the original annotation even without a BioMart match
datGenes = getBM(attributes = getinfo, filters=c('entrezgene'), values = datGenes_original$entrezgene, mart=mart) %>% 
           right_join(datGenes_original %>% dplyr::select(entrezgene, hgnc_symbol), by = 'entrezgene')
# Genomic span (end - start), used later as the gene length
datGenes$length = datGenes$end_position - datGenes$start_position

# ! There were some multiple matches between entrezgene and ensemblID
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 7504/29332 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels

cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
# Join on both Ensembl ID and symbol so a label is only transferred when both agree
datGenes = datGenes %>% left_join(NCBI_biotype, by=c('ensembl_gene_id','hgnc_symbol'))

cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
             ' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 12176/29332 Ensembl IDs weren't found in the NCBI database
# Number of genes labelled by NCBI
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))

########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
               host = 'jan2020.archive.ensembl.org')
# Only query the genes that are still missing a biotype after step 2.1
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart, 
                         values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/', 
           sum(is.na(datGenes$gene_biotype)),
           ' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 9704/12176 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
# coalesce keeps the existing (NCBI) label and fills gaps with the BioMart one;
# as.character because gene_biotype.x is presumably a factor under read.csv defaults
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
           mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
           dplyr::select(-gene_biotype.x, -gene_biotype.y)

# Genes newly labelled by this query
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]

########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
# (this is step 2.3 of the outline above; same jan2020 mart, keyed by symbol)

missing_genes = unique(datGenes$hgnc_symbol[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes=getinfo, filters=c('hgnc_symbol'), mart=mart,
                                 values=missing_genes)

cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
           length(missing_genes),
           ' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 6490/8998 genes weren't found in the current BioMart version when querying by gene name
# Symbols that returned more than one biotype are ambiguous and are excluded
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0('    ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
##     14 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by='hgnc_symbol') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

# Genes newly labelled by this query (subtract everything counted so far)
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
# (step 2.4: last resort for genes still unlabelled after NCBI and jan2020)

missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])

getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl', 
               host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'), 
                                 values = missing_ensembl_ids, mart=mart)

cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
             ' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 1/170 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>% 
            mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
            dplyr::select(-gene_biotype.x, -gene_biotype.y)

# Genes newly labelled by the archive fallback
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)

########################################################################################
# Plot results

# Share of genes labelled by each source, shown as one stacked horizontal bar
labels_source = labels_source %>% mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),1))

p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
    theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
    axis.text.y=element_blank(), axis.ticks.y=element_blank())

# Plotly version without legend; legend rendered separately below
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))

########################################################################################
# Remove repeated entries by entrezgene

# Remove ensembl IDS with 'LRG' notation
# (LRG = Locus Reference Genomic records), then keep the first match per Entrez ID
datGenes = datGenes %>% filter(!grepl('LRG', ensembl_gene_id) & !is.na(entrezgene)) %>% 
            distinct(entrezgene, .keep_all = TRUE)

########################################################################################
# Reorder rows to match datExpr
# NOTE: Entrez IDs dropped above come back as all-NA rows here, which keeps datGenes
# the same length and order as datExpr's rows
datGenes = datGenes[match(datGenes_original$entrezgene, datGenes$entrezgene),]


rm(getinfo, mart, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
   dups, missing_ensembl_ids, missing_genes, labels_source, p)



Neuronal function

The neuronal function is obtained from the Gene Ontology annotations for each gene, defining each gene whose GO terms contain the substring ‘neuro’ as having a neuronal function. The pipeline to process and clean each gene’s GO annotations can be found in ./../Data/inputData/genes_GO_annotations.csv

GO_annotations = read.csv('./../Data/inputData/genes_GO_annotations.csv')

# Genes with any GO term containing 'neuro' are flagged as having neuronal function
# (one row per Ensembl ID, with a constant Neuronal = 1 indicator)
GO_neuronal = GO_annotations %>% 
              filter(grepl('neuro', go_term)) %>% 
              transmute(ID = as.character(ensembl_gene_id)) %>% 
              distinct(ID) %>% 
              mutate(Neuronal = 1)

rm(GO_annotations)




2.2.2 Filtering



Checking how many SFARI genes are in the dataset

# Intersect SFARI genes with the dataset annotations (SFARI ID matched to Ensembl ID)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
# Number of distinct SFARI gene symbols present in the dataset
n_SFARI = df[['gene-symbol']] %>% unique %>% length

Considering all genes, this dataset contains 786 of the 912 SFARI genes

1.- Remove rows that don’t correspond to genes

This dataset doesn’t have non-gene rows, but it does have missing gene_biotype information

# Keep only rows with a known biotype (drops the genes no source could label)
to_keep = !is.na(datGenes$gene_biotype)

datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]


Removed 6335 ‘genes’, 19646 remaining




2. Keep only protein coding genes

81% of the genes are protein coding genes

# Frequency table of biotypes, most common first
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
                          kable_styling(full_width = F)
Biotypes of genes in dataset
. Freq
protein_coding 15977
1 1454
lncRNA 1330
3 261
transcribed_unprocessed_pseudogene 194
processed_pseudogene 122
unprocessed_pseudogene 79
6 47
transcribed_processed_pseudogene 35
transcribed_unitary_pseudogene 30
lincRNA 22
snoRNA 15
antisense 14
pseudogene 12
processed_transcript 9
7 8
TEC 8
polymorphic_pseudogene 6
misc_RNA 5
rRNA 5
snRNA 5
unitary_pseudogene 3
miRNA 1
scaRNA 1
sense_intronic 1
TR_C_gene 1
translated_unprocessed_pseudogene 1

Most of the non-protein coding genes have very low levels of expression

# Mean expression per gene, split by whether the gene is protein coding.
# rowMeans instead of apply(datExpr, 1, mean): same values, but avoids the
# data.frame -> matrix coercion apply performs and is much faster
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = rowMeans(datExpr),
                       'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(MeanExpr, fill=ProteinCoding, color=ProteinCoding)) + 
         geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
# Recompute the SFARI/dataset intersection on the filtered annotations
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))

Filtering protein coding genes, we are left with 783 SFARI Genes (we lose 3 genes)

Note: The gene name for Ensembl ID ENSG00000187951 is wrong, it should be AC091057.1 instead of ARHGAP11B, but the biotype is right, so it would still be filtered out

# SFARI genes that would remain after keeping protein-coding biotypes only
n_SFARI = df[['gene-symbol']][df$gene_biotype=='protein_coding'] %>% unique %>% length

# List the SFARI genes that the protein-coding filter would drop
df %>% filter(!`gene-symbol` %in% df$`gene-symbol`[df$gene_biotype=='protein_coding']) %>% 
       dplyr::select(ID, `gene-symbol`, `gene-score`, gene_biotype, syndromic, `number-of-reports`) %>% 
       kable(caption='Lost Genes')  %>% kable_styling(full_width = F)
Lost Genes
ID gene-symbol gene-score gene_biotype syndromic number-of-reports
ENSG00000187951 ARHGAP11B 3 3 0 2
ENSG00000187951 ARHGAP11B 3 lncRNA 0 2
ENSG00000224639 C4B 3 lncRNA 0 6
ENSG00000233067 PTCHD1-AS 2 lncRNA 0 3
rm(df)
# Keep protein-coding genes only
to_keep = datGenes$gene_biotype=='protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
# filter() drops rownames, so restore them from the Entrez IDs
rownames(datExpr) = datGenes$entrezgene
rownames(datGenes) = datGenes$entrezgene


Removed 3669 genes. 15977 remaining




3. Remove genes with low levels of expression

This seems to have already been done in the original preprocessing of the data, so I won’t do it again


4. Remove outlier samples

Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

\(\qquad\) - Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples

\(\qquad\) - All the outlier samples belong to the ASD group. Apart from this they don’t seem to have any characteristic in common (different subjects, extraction batches, brain lobes, age, PMI), except for Sex (but this is probably just because of the sex bias in the dataset)

# Sample-sample similarity: absolute biweight midcorrelation between samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
# Standardised connectivity (z-score); low values = weakly connected samples
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID, 
                       'Subject_ID'=datMeta$Subject_ID, 
                       'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
                       'Diagnosis'=datMeta$Diagnosis, 'Ethnicity'=datMeta$Ethnicity)

selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])

Outlier samples: sample25, sample3, sample31

# Drop samples more than 2 SD below the mean connectivity (left tail only)
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

rm(absadj, netsummary, ku, z.ku, plot_data)

Removed 3 samples, 49 remaining

# Clean up the filtering mask
rm(to_keep)



5. Remove repeated genes

There are no repeated genes in this dataset, so instead, I’m going to replace this step with:


5. Remove genes without or repeated ensembl ID

There are 194 genes without an Ensembl ID and 450 genes with a repeated Ensembl ID

# Drop genes with a missing or duplicated Ensembl ID (keeps the first occurrence)
to_keep = !is.na(datGenes$ensembl_gene_id) & !duplicated(datGenes$ensembl_gene_id)

datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
# Re-index the expression matrix by Ensembl ID (now unique)
rownames(datExpr) = datGenes$ensembl_gene_id

rm(to_keep)




After filtering, the dataset consists of 15526 genes and 49 samples

Save filtered and annotated dataset

# Persist the filtered (but not yet batch-corrected) dataset
save(datExpr, datMeta, datGenes, file='./../Data/preprocessedData/filtered_raw_data.RData')
#load('./../Data/preprocessedData/filtered_raw_data.RData')




2.2.3 Batch effects modelling



Note: No batch correction is performed in this section, this is done after the normalisation step

Known sources of batch effects


There are no known sources of batch effects in this dataset



Unknown sources of batch effects


Using the sva function from the sva package following this tutorial

# Estimate surrogate variables for unknown batch effects, protecting Diagnosis
mod = model.matrix(~Diagnosis, data = datMeta)
sva_fit = sva(datExpr %>% as.matrix, mod)
## Number of significant surrogate variables is:  11 
## Iteration (out of 5 ):1  2  3  4  5
# svaseq alternative (for count data); not used since the data is already transformed
#mod = model.matrix(~Diagnosis, data = datMeta)
#mod0 = model.matrix(~1, data = datMeta)
#sva_fit = svaseq(exp(datExpr) %>% as.matrix, mod = mod, mod0 = mod0)

rm(mod)

Found 11 surrogate variables

Include SV estimations to datMeta information

# Attach the estimated surrogate variables to the metadata as columns SV1..SVn
sv_df = data.frame(sva_fit$sv)
names(sv_df) = paste0('SV', seq_len(ncol(sv_df)))

datMeta = cbind(datMeta, sv_df)

rm(sv_df, sva_fit)






2.2.4 Normalisation, differential expression analysis and data transformation



This data has already been normalised and transformed, so we are only going to perform the differential expression analysis.

Since our data has already been preprocessed, we cannot use DESeq2 to perform the differential expression analysis. Instead, I’m going to use limma following the procedure in RNA-seq analysis is easy as 1-2-3 with limma, Glimma and edgeR

# DEA with limma: Diagnosis effect adjusted for the 11 surrogate variables
fit = lmFit(datExpr, design=model.matrix(~SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + SV10 + SV11 + 
                                           Diagnosis, data = datMeta))
efit = eBayes(fit)
# Full results table kept in the original gene order (sort.by = 'none')
DE_info = topTable(efit, sort.by = 'none', n = Inf, coef = 'DiagnosisASD')
# Shrunken log fold changes via limma's predictive fold change estimator
DE_info$shrunken_log2FoldChange = predFCm(efit, coef = 'DiagnosisASD')


rm(fit, efit)

DEA plots


The original LFC has a positive relation with the level of expression of the genes, which is inverted by the shrunken LFC (Perhaps this is a consequence of the inverted relation between mean and SD in this dataset?)

NOTE: The author of limma does not recommend to perform log fold change shrinkage in the DE step, claiming it is already over-done at the normalisation step and when it isn’t, it should be done in other stages of the processing pipeline, not here, so I’m not sure which of the LFC estimations is more reliable in this case. I’ll keep both (https://support.bioconductor.org/p/100804/)

# Original LFC magnitude vs mean expression, coloured by significance (adj.P.Val < 0.05)
p1 = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% 
     left_join(DE_info %>% data.frame %>% mutate(ID = rownames(.), significant = adj.P.Val<0.05), by='ID') %>%
     ggplot(aes(meanExpr, abs(logFC), color = significant)) + 
     geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.01, aes(color=significant)) +
     xlab('Mean Expression') + ylab('Original LFC Magnitude') + scale_y_sqrt() +
     theme_minimal() + theme(legend.position = 'none')

# Same plot for the shrunken LFC estimates
p2 = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% 
     left_join(DE_info %>% data.frame %>% mutate(ID = rownames(.), significant = adj.P.Val<0.05), by='ID') %>% 
     ggplot(aes(meanExpr, abs(shrunken_log2FoldChange), color = significant)) + scale_y_sqrt() +
     geom_point(alpha = 0.1, shape = 1) + geom_smooth(alpha = 0.2) + xlab('Mean Expression') + 
     ylab('Shrunken LFC Magnitude') + theme_minimal() + labs(color = 'DE')

# Side by side with a single shared legend
ggarrange(p1,p2, nrow = 1, common.legend = TRUE, legend = 'bottom')

rm(p1, p2)


Data transformation plots


The data has a larger variance in genes with the lowest levels of expression than in the rest of the dataset

# Mean vs standard deviation per gene, with a smoothed trend line
data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd)) %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
              theme_minimal()





2.2.5 Batch Effects Correction



By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data

Unknown sources of batch effects


In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.

Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:

  • Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)

  • But caution should be exercised to avoid removing biological signal of interest

  • We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective

  • Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed

Comparing data with and without surrogate variable correction

# Taken from https://www.biostars.org/p/121489/#121500
# Taken from https://www.biostars.org/p/121489/#121500
# Regress the surrogate variables out of the expression matrix while keeping the
# effect of the modelled covariates (mod). datExpr: genes x samples; mod: design
# matrix of protected covariates; svs: matrix of surrogate variables (samples x k).
correctDatExpr = function(datExpr, mod, svs) {
  design = cbind(mod, svs)
  # OLS coefficients for every gene at once: beta = (X'X)^-1 X'Y
  coefs = solve(t(design) %*% design, t(design) %*% t(datExpr))
  n_covars = ncol(mod)
  sv_cols = seq(n_covars + 1, ncol(design))
  # Subtract only the component explained by the surrogate variables
  datExpr - t(design[, sv_cols, drop = FALSE] %*% coefs[sv_cols, , drop = FALSE])
}

# PCA of samples and genes before the correction, for comparison below
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp

# Regress the surrogate variables out of the expression data, protecting Diagnosis.
# Use all 11 SVs estimated by sva so the correction matches the DEA design
# (~ SV1 + ... + SV11 + Diagnosis); the previous SV1:SV8 selection was inconsistent
# with the 11 surrogate variables found and looks like a leftover from another script
mod = model.matrix(~ Diagnosis, datMeta)
svs = datMeta %>% dplyr::select(SV1:SV11) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

# PCA of the corrected data
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp

rm(correctDatExpr)
Samples

Removing batch effects improves the separation between diagnosis groups, but not as much as with Gandal’s dataset

# Sample PCA before (frame 0) and after (frame 1) SV correction, joined to metadata
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=datMeta$Subject_ID), by='ID')

# Axis labels report the before/after PC correlation and the change in variance explained
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis, shape=Diagnosis)) + 
         geom_point(aes(frame=corrected, id=ID), alpha=0.75) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)


Genes

It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)

*Plot is done with only 10% of the genes so it’s not that heavy

# Gene PCA before/after correction, coloured by mean expression
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

# Subsample 10% of genes to keep the interactive plot light
# NOTE(review): sample() is not seeded here, so the displayed subset differs between runs
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + 
         geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)

Everything looks good, so we’re keeping the corrected expression dataset

# Keep the SV-corrected expression matrix for all downstream analyses
datExpr = datExpr_corrected

rm(datExpr_corrected)


Known sources of batch effects


There are no known sources of batch effects in this dataset




Save preprocessed dataset

# Join original and shrunken LFC values
genes_info = DE_info %>% data.frame %>%
             dplyr::rename('log2FoldChange' = logFC, 'padj' = adj.P.Val, 'baseMean' = AveExpr) %>% 
             mutate(significant=padj<0.05 & !is.na(padj)) %>% 
             add_column('ID'=rownames(datExpr), 'entrezgene'=datGenes$entrezgene, .before='baseMean') %>% 
             left_join(GO_neuronal, by='ID') %>%
             mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal))

save(datExpr, datMeta, datGenes, genes_info, file='./../Data/preprocessedData/preprocessed_data.RData')




Session info

# Record package versions for reproducibility
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] limma_3.40.6                GEOquery_2.52.0            
##  [3] kableExtra_1.1.0            knitr_1.32                 
##  [5] expss_0.10.2                dendextend_1.13.4          
##  [7] vsn_3.52.0                  WGCNA_1.69                 
##  [9] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
## [11] sva_3.32.1                  genefilter_1.66.0          
## [13] mgcv_1.8-31                 nlme_3.1-147               
## [15] DESeq2_1.24.0               SummarizedExperiment_1.14.1
## [17] DelayedArray_0.10.0         BiocParallel_1.18.1        
## [19] matrixStats_0.56.0          Biobase_2.44.0             
## [21] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [23] IRanges_2.18.3              S4Vectors_0.22.1           
## [25] BiocGenerics_0.30.0         biomaRt_2.40.5             
## [27] ggpubr_0.2.5                magrittr_2.0.1             
## [29] ggExtra_0.9                 GGally_1.5.0               
## [31] gridExtra_2.3               viridis_0.5.1              
## [33] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [35] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [37] glue_1.4.2                  reshape2_1.4.4             
## [39] forcats_0.5.0               stringr_1.4.0              
## [41] dplyr_1.0.1                 purrr_0.3.4                
## [43] readr_1.3.1                 tidyr_1.1.0                
## [45] tibble_3.0.3                ggplot2_3.3.2              
## [47] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] crosstalk_1.1.0.1      digest_0.6.27          foreach_1.5.0         
##  [10] htmltools_0.5.1.1      GO.db_3.8.2            fansi_0.4.1           
##  [13] checkmate_2.0.0        memoise_1.1.0          doParallel_1.0.15     
##  [16] cluster_2.1.0          annotate_1.62.0        modelr_0.1.6          
##  [19] prettyunits_1.1.1      jpeg_0.1-8.1           colorspace_1.4-1      
##  [22] blob_1.2.1             rvest_0.3.5            haven_2.2.0           
##  [25] xfun_0.22              crayon_1.3.4           RCurl_1.98-1.2        
##  [28] jsonlite_1.7.2         impute_1.58.0          iterators_1.0.12      
##  [31] survival_3.2-7         gtable_0.3.0           zlibbioc_1.30.0       
##  [34] XVector_0.24.0         webshot_0.5.2          scales_1.1.1          
##  [37] DBI_1.1.0              miniUI_0.1.1.1         Rcpp_1.0.6            
##  [40] xtable_1.8-4           progress_1.2.2         htmlTable_1.13.3      
##  [43] foreign_0.8-76         bit_4.0.4              preprocessCore_1.46.0 
##  [46] Formula_1.2-3          htmlwidgets_1.5.1      httr_1.4.2            
##  [49] acepack_1.4.1          ellipsis_0.3.1         farver_2.0.3          
##  [52] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [55] nnet_7.3-14            dbplyr_1.4.2           locfit_1.5-9.4        
##  [58] labeling_0.3           tidyselect_1.1.0       rlang_0.4.10          
##  [61] later_1.1.0.1          AnnotationDbi_1.46.1   munsell_0.5.0         
##  [64] cellranger_1.1.0       tools_3.6.3            cli_2.0.2             
##  [67] generics_0.0.2         RSQLite_2.2.0          broom_0.7.0           
##  [70] evaluate_0.14          fastmap_1.0.1          yaml_2.2.1            
##  [73] bit64_4.0.5            fs_1.5.0               mime_0.10             
##  [76] xml2_1.2.5             compiler_3.6.3         rstudioapi_0.11       
##  [79] curl_4.3               png_0.1-7              affyio_1.54.0         
##  [82] ggsignif_0.6.0         reprex_0.3.0           geneplotter_1.62.0    
##  [85] stringi_1.5.3          highr_0.8              lattice_0.20-41       
##  [88] Matrix_1.2-18          vctrs_0.3.2            pillar_1.4.6          
##  [91] lifecycle_0.2.0        BiocManager_1.30.10    cowplot_1.0.0         
##  [94] data.table_1.12.8      bitops_1.0-6           httpuv_1.5.5          
##  [97] affy_1.62.0            R6_2.5.0               latticeExtra_0.6-29   
## [100] promises_1.2.0.1       codetools_0.2-16       assertthat_0.2.1      
## [103] withr_2.2.0            GenomeInfoDbData_1.2.1 hms_0.5.3             
## [106] grid_3.6.3             rpart_4.1-15           rmarkdown_2.7         
## [109] shiny_1.4.0.2          lubridate_1.7.4        base64enc_0.1-3